Word Associations, Reported Content

It turns out that words like the n-word and TERF are appearing very frequently in the holyoke and smith confessional respectively. We want to dig deeper into how these and other controversial words are being used.

  • What words appear in the same secret as the words above?
  • What kinds of secrets are reported, and how might they overlap the above words?
  • What word associations can be found in the corpus?
In [5]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import functools

from os import path
from scipy.ndimage import imread
from nltk.util import ngrams
from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from IPython.display import display

import cufflinks as cf
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode()

plt.style.use('ggplot')
%matplotlib inline
In [155]:
# Reading in data
holyc_df = pd.read_csv('../tmp/clean/holyokecon_confessional_comments.csv')
holyr_df = pd.read_csv('../tmp/clean/holyokecon_confessional_reports.csv')
holys_df = pd.read_csv('../tmp/clean/holyokecon_confessional_secrets.csv')
holyraw_df = pd.read_csv('../tmp/raw/holyokecon_confessional_secrets.csv')
holyrawr_df = pd.read_csv('../tmp/raw/holyokecon_confessional_reports.csv')

# defining some global variables
SECRET_COL = 'clean_tokens_secret'
REPORT_COL = 'clean_tokens_report'
In [175]:
holysr_df = holys_df.merge(holyr_df, left_on='id', right_on="secret_id",
                          how='left', suffixes=('_secret', '_report'))
holysr_df = holysr_df.merge(holyraw_df[['id', 'create_date', 'confession']],
                            left_on='id_secret', right_on='id', how='left')
holysr_df = holysr_df.merge(holyrawr_df[['id', 'reason']],
                            left_on='id_report', right_on='id', how='left')
holysr_df.rename(columns={'reason': 'report_reason'}, inplace=True)
#preprocess: remove rows with null clean_tokens_secret value
holysr_df = holysr_df[holysr_df[SECRET_COL].notnull()]
holysr_df.head()
Out[175]:
id_secret comments clean_tokens_secret id_report clean_tokens_report secret_id comment_id id_x create_date confession id_y report_reason
0 14040 25 goddamn insomnia NaN NaN NaN NaN 14040 1265857560 goddamn insomnia. NaN NaN
1 13994 19 sleep keep secret NaN NaN NaN NaN 13994 1265857560 GO TO SLEEP. KEEP YOUR SECRETS TO YOURSELF. NaN NaN
2 10971 15 accident waiting happen 2120495 wrong thread 10971 2120493 10971 1265857560 we are accidents waiting to happen 2120495 Wrong thread
3 12515 31 site ruining life NaN NaN NaN NaN 12515 1265857560 Is this site ruining your life? NaN NaN
4 9854 10 kick dont believe 1017 troll 9854 90928 9854 1265857560 I just do it for kicks, and I don't believe an... 1017 troll.
In [235]:
# detecting secrets containing a specific word
pattern = r'gay|lesbian|bisex' # niggar|nigger|asian|yellow|latino|white|gay|lesbian|trans|bi
selector = holysr_df[SECRET_COL].str.contains(pattern)
match_df = holysr_df[selector]

# Drop duplicate secrets
match_secrets = match_df.drop_duplicates('id_secret')

# Match not reported
match_not_reported = match_secrets[match_secrets['id_report'].isnull()]

# Match reported
match_reported = match_secrets[match_secrets['id_report'].notnull()]

# Select report text
report_text = match_df[match_df[REPORT_COL].notnull()]
In [236]:
word_cloud_options = {
    'width': 800, 
    'height': 800,
    'background_color': "white", 
    'max_words': 500, 
    'stopwords': STOPWORDS,
    'random_state': 42   
}

def create_word_cloud(text_iterable, image_color_fp=None,
                      title='', **kwargs):
    confesh_coloring = imread(image_color_fp)
    kwargs.update({'mask': confesh_coloring})
    wc = WordCloud(**kwargs)
    text = " ".join(text_iterable)
    wc.generate(text)

    image_colors = ImageColorGenerator(confesh_coloring)

    plt.figure(figsize=(8,8))
    plt.title(title)
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis("off")
    plt.show()
    
logo_fp = '../assets/logo2.png'
# Word Cloud of Match
create_word_cloud(match_secrets[SECRET_COL].astype(str),
                  logo_fp, title="Holyoke Secrets Containing the word %s" % pattern,
                  **word_cloud_options)
In [237]:
# Defining functions to compute word frequency
def word_counter(text, n=1, length_thres=50):
    t = text.split()
    t = [tk for tk in t if len(tk) < length_thres]
    for i in range(n):
        t_ngrams = [" ".join(b) for b in list(ngrams(t, i + 1))]
        t.extend(t_ngrams)
    return Counter(t)

def word_aggregater(corpus_list, n=1):
    c = Counter()
    for doc in corpus_list:
        c.update(word_counter(doc, n=n))
    return c

def count_token_frequency(token_series, filter_thres, **kwargs):
    freq_df =  pd.DataFrame(word_aggregater(token_series, **kwargs).items())
    freq_df.rename(columns={0: 'word', 1: 'frequency'}, inplace=True)
    freq_df = freq_df[freq_df['frequency'] > filter_thres] \
        .sort_values('frequency', ascending=False)
    freq_df['ngrams'] = freq_df['word'].apply(lambda x: len(x.split()))
    return freq_df.reset_index(drop=True)

# create frequency count dataframes
secrets_corpus = count_token_frequency(match_secrets['clean_tokens_secret'], 0, n=3)
secrets_not_reported_corpus = count_token_frequency(match_not_reported['clean_tokens_secret'], 0, n=3)
secrets_reported_corpus = count_token_frequency(match_reported['clean_tokens_secret'], 0, n=3)
report_text_corpus = count_token_frequency(report_text['clean_tokens_secret'], 0, n=3)
In [260]:
# merge frequencies for all secrets, reported, and not reported
merge_cols = ['word', 'frequency']
all_corpus = secrets_corpus.merge(secrets_not_reported_corpus[merge_cols], on="word", 
                                  how="left", suffixes=("_all", "_not_reported"))
all_corpus = all_corpus.merge(secrets_reported_corpus[merge_cols], on="word", how="left")
all_corpus = all_corpus.rename(columns={'frequency': 'frequency_reported'})
all_corpus.head()
Out[260]:
word frequency_all ngrams frequency_not_reported frequency_reported
0 gay 2352 1 2168 184
1 lesbian 1452 1 1392 60
2 like 1214 1 944 270
3 dont 972 1 698 274
4 girl 902 1 700 202
In [261]:
# sanity check on the word frequency counter:
# since not_reported and reported secrets should
# be mutually exclusive, frequency_all should equal
# the sum of frequency_not_reported and frequency_reported

secret_sum = all_corpus[['frequency_not_reported', 'frequency_reported']].sum(axis=1)
not_equal = all_corpus[~(secret_sum == all_corpus['frequency_all'])]
print not_equal.shape[0]
0

Sanity check passed!

In [270]:
# creating custom annotations for the plot
# when you hover over a specific bar on the plot,
# you should be able to see the top 4 posts
# containing that word, sorted by number of comments

def format_text_annotation(text_list, n=60):
    text_list = [t.decode('utf-8').encode('ascii', 'ignore') for t in text_list]
    text_list = [" ".join(t.split()) for t in text_list]
    text_list = "<br>".join([t if len(t) < n else t[:n] + "..." for t in text_list])
    return text_list

def token_top_secrets(token, comment_col='comments', n=5):
    top_secrets = holysr_df[holysr_df[SECRET_COL].str.contains(token)] \
        .sort_values(comment_col, ascending=False)['confession']
    top_secrets = top_secrets.drop_duplicates().tolist()
    if len(top_secrets) < n:
        n = len(top_secrets)
    return format_text_annotation(top_secrets[:n])

def token_reports_text(token, comment_col='comments', n=5):
    top_reports = report_text[report_text[SECRET_COL].str.contains(token)] \
        .sort_values(comment_col, ascending=False)['confession']
    top_reports = top_reports.drop_duplicates().tolist()
    if len(top_reports) < n:
        n = len(top_reports)
    return format_text_annotation(top_reports[:n])

# filter all_corpus to pick top n tokens for each ngram
n = 25
all_corpus = pd.concat([
    all_corpus[all_corpus['ngrams'] == 1].sort_values('frequency_all', ascending=False)[:n],
    all_corpus[all_corpus['ngrams'] == 2].sort_values('frequency_all', ascending=False)[:n],
    all_corpus[all_corpus['ngrams'] == 3].sort_values('frequency_all', ascending=False)[:n]
])
all_corpus['top_secrets'] = all_corpus['word'].apply(token_top_secrets)
all_corpus['top_reports'] = all_corpus['word'].apply(token_reports_text)
all_corpus.head()
Out[270]:
word frequency_all ngrams frequency_not_reported frequency_reported top_secrets top_reports
0 gay 2352 1 2168 184 Okay, we've done teams. let's do majors. stere... Okay, we've done teams. let's do majors. stere...
1 lesbian 1452 1 1392 60 WTF! Why are moho girls so damn horny?!?!?! li... WTF! Why are moho girls so damn horny?!?!?! li...
2 like 1214 1 944 270 Let's give this a go: RATE MY BODY! Post a pic... Okay, we've done teams. let's do majors. stere...
3 dont 972 1 698 274 Bringing back an oldie. Paste whatever is in y... Okay, we've done teams. let's do majors. stere...
4 girl 902 1 700 202 wTF with the RAOS girls? why are they always r... WTF! Why are moho girls so damn horny?!?!?! li...
In [271]:
def create_bar_trace(dataframe, graph_obj, x_col, y_col, text_col, **go_kwargs):
    return graph_obj(
        y=dataframe[x_col],
        x=dataframe[y_col],
        text=dataframe[text_col],
        **go_kwargs)

def create_word_freq_subplot(dataframe, ngrams=1):
    dataframe = dataframe[dataframe['ngrams'] == ngrams].copy()
    dataframe.sort_values('frequency_all', inplace=True, ascending=False)
    trace1 = create_bar_trace(dataframe, go.Bar,
                              'frequency_not_reported', 'word', 'top_secrets',
                              name='<b>Not Reported</b>',
                              marker={'color': '#bc94d3'},
                              showlegend=False)
    trace2 = create_bar_trace(dataframe, go.Bar,
                              'frequency_reported', 'word', 'top_reports',
                              name='<b>Reported</b>',
                              marker={'color': '#8551a3'},
                              showlegend=False)
    data = [trace1, trace2]
    return data
    
def add_subplot_fig(fig, row, col, traces):
    for t in traces:
        fig.append_trace(t, row, col)
    return fig

subplot1 = create_word_freq_subplot(all_corpus, ngrams=1)
subplot2 = create_word_freq_subplot(all_corpus, ngrams=2)
subplot3 = create_word_freq_subplot(all_corpus, ngrams=3)
fig = tools.make_subplots(rows=3, cols=1, 
                          subplot_titles=('Unigrams', 'Bigrams', 'Trigrams'),
                          vertical_spacing = 0.12)

add_subplot_fig(fig, 1, 1, subplot1)
add_subplot_fig(fig, 2, 1, subplot2)
add_subplot_fig(fig, 3, 1, subplot3)

title = 'Frequency of Words/Phrases in Confessions Containing %s' \
    % ", ".join(["\"" + p + "\"" for p in pattern.split('|')])
xaxis_domain = fig['layout']['xaxis1']['domain']
fig['layout'].update(
    {
        'title': title,
        'titlefont': {'size': 20},
        'height': 1200,
        'width': 1000,
        'barmode': 'stack',
        'margin': {'l': 100, 'r': 100, 'b': 125, 't': 100, 'pad': 10},
        'xaxis1': {
            'tickangle': -45
        },
        'xaxis2': {
            'tickangle': -45
        },
        'xaxis3': {
            'tickangle': -45
        }
    }
)
iplot(fig)
This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]

Drawing...